{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example: Optimizing Data Extraction (NER) with TensorZero"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import json\n",
    "from collections import Counter\n",
    "from typing import Dict, List, Optional\n",
    "\n",
    "import altair as alt\n",
    "import pandas as pd\n",
    "from tensorzero import AsyncTensorZeroGateway, InferenceResponse\n",
    "from tqdm import tqdm\n",
    "from tqdm.asyncio import tqdm_asyncio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Update the gateway URL below if you're not using the standard setup provided in this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TENSORZERO_GATEWAY_URL = \"http://localhost:3000\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select only a subset of the dataset to speed things up\n",
    "NUM_TRAIN_DATAPOINTS = 500\n",
    "NUM_VAL_DATAPOINTS = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(path: str) -> (pd.DataFrame, pd.DataFrame):\n",
    "    # Load the dataset\n",
    "    df = pd.read_csv(path)\n",
    "    df.output = df.output.apply(json.loads)\n",
    "\n",
    "    # Split the dataset into train and validation sets\n",
    "    train_df = df[df[\"split\"] == 0]\n",
    "    val_df = df[df[\"split\"] == 1]\n",
    "\n",
    "    # Shuffle the splits\n",
    "    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)\n",
    "    val_df = val_df.sample(frac=1, random_state=0).reset_index(drop=True)\n",
    "\n",
    "    # Select only a subset of the dataset to speed things up\n",
    "    train_df = train_df.iloc[:NUM_TRAIN_DATAPOINTS]\n",
    "    val_df = val_df.iloc[:NUM_VAL_DATAPOINTS]\n",
    "\n",
    "    return train_df, val_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, val_df = load_dataset(\"data/conllpp.csv\")\n",
    "\n",
    "print(f\"Train data shape: {train_df.shape}\")\n",
    "print(f\"Validation data shape: {val_df.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract Entities"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Reduce the number of concurrent requests if you're running into rate limits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_CONCURRENT_REQUESTS = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tensorzero_client = await AsyncTensorZeroGateway.build_http(gateway_url=TENSORZERO_GATEWAY_URL, timeout=15)\n",
    "semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def get_entities(\n",
    "    text: str,\n",
    "    variant_name: Optional[str] = None,\n",
    "    dryrun: bool = False,\n",
    ") -> Optional[InferenceResponse]:\n",
    "    # Use a semaphore to avoid rate limits\n",
    "    async with semaphore:\n",
    "        try:\n",
    "            return await tensorzero_client.inference(\n",
    "                function_name=\"extract_entities\",\n",
    "                input={\"messages\": [{\"role\": \"user\", \"content\": text}]},\n",
    "                dryrun=dryrun,\n",
    "                variant_name=variant_name,\n",
    "            )\n",
    "        except Exception as e:\n",
    "            print(f\"Error occurred: {type(e).__name__}: {e}\")\n",
    "            return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run inference in parallel to speed things up\n",
    "responses = await tqdm_asyncio.gather(*[get_entities(text) for text in train_df[\"input\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def flatten_dict(d: Dict[str, List[str]]) -> List[str]:\n",
    "    res = []\n",
    "    for k, v in d.items():\n",
    "        assert isinstance(v, list)\n",
    "        for elt in v:\n",
    "            res.append(f\"__{k.upper()}__::{elt}\")\n",
    "    return res\n",
    "\n",
    "\n",
    "# Exact match between the predicted and ground truth entities (the sharpest metric we use to evaluate NER)\n",
    "def compute_exact_match(predicted: Dict[str, List[str]], ground_truth: Dict[str, List[str]]) -> bool:\n",
    "    return set(flatten_dict(predicted)) == set(flatten_dict(ground_truth))\n",
    "\n",
    "\n",
    "# Jaccard similarity between the predicted and ground_truth entities\n",
    "# (a more lenient metric that gives partial credit for correct entities)\n",
    "# This is a different implementation from the original code by Predibase, so the metrics won't be directly comparable.\n",
    "def compute_jaccard_similarity(predicted: Dict[str, List[str]], ground_truth: Dict[str, List[str]]) -> float:\n",
    "    target_entities = flatten_dict(ground_truth)\n",
    "    pred_entities = flatten_dict(predicted)\n",
    "    target_count = Counter(target_entities)\n",
    "    pred_count = Counter(pred_entities)\n",
    "    num = 0\n",
    "    den = 0\n",
    "    all_keys = set(target_entities).union(set(pred_entities))\n",
    "    for key in all_keys:\n",
    "        num += min(target_count.get(key, 0), pred_count.get(key, 0))\n",
    "        den += max(target_count.get(key, 0), pred_count.get(key, 0))\n",
    "    if den == 0:\n",
    "        return 1\n",
    "    return num / den"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_response(response: Optional[InferenceResponse], ground_truth_data: Dict[str, List[str]]):\n",
    "    predicted = response.output.parsed if response else None\n",
    "\n",
    "    # `predicted` is None if the model failed to return a valid JSON that complies with the output schema\n",
    "    valid_output = predicted is not None\n",
    "\n",
    "    # Compute the other metrics\n",
    "    exact_match = compute_exact_match(predicted, ground_truth_data) if predicted else False\n",
    "    jaccard_similarity = compute_jaccard_similarity(predicted, ground_truth_data) if predicted else 0\n",
    "\n",
    "    return valid_output, exact_match, jaccard_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for response, ground_truth in tqdm(zip(responses, train_df[\"output\"]), total=len(responses)):\n",
    "    # Don't send feedback if the request failed completely\n",
    "    if response is None:\n",
    "        continue\n",
    "\n",
    "    # Evaluate the example\n",
    "    valid_output, exact_match, jaccard_similarity = evaluate_response(response, ground_truth)\n",
    "\n",
    "    # Send the metrics feedback to TensorZero\n",
    "    await tensorzero_client.feedback(\n",
    "        metric_name=\"valid_output\",\n",
    "        value=valid_output,\n",
    "        inference_id=response.inference_id,\n",
    "    )\n",
    "\n",
    "    await tensorzero_client.feedback(\n",
    "        metric_name=\"exact_match\",\n",
    "        value=exact_match,\n",
    "        inference_id=response.inference_id,\n",
    "    )\n",
    "\n",
    "    await tensorzero_client.feedback(\n",
    "        metric_name=\"jaccard_similarity\",\n",
    "        value=jaccard_similarity,\n",
    "        inference_id=response.inference_id,\n",
    "    )\n",
    "\n",
    "    # Send the demonstration feedback to TensorZero\n",
    "    await tensorzero_client.feedback(\n",
    "        metric_name=\"demonstration\",\n",
    "        value=ground_truth,\n",
    "        inference_id=response.inference_id,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation Set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Update the list below when you create new variants in `tensorzero.toml`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Include the variants in `tensorzero.toml` that we want to evaluate\n",
    "VARIANTS_TO_EVALUATE = [\n",
    "    \"gpt_4o\",\n",
    "    \"gpt_4o_mini\",\n",
    "    # \"gpt_4o_mini_fine_tuned\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = {}  # variant_name => (valid_output, exact_match, jaccard_similarity)\n",
    "\n",
    "for variant_name in VARIANTS_TO_EVALUATE:\n",
    "    # Run inference on the validation set\n",
    "    responses = await tqdm_asyncio.gather(\n",
    "        *[\n",
    "            get_entities(\n",
    "                text,\n",
    "                variant_name=variant_name,  # pin to the specific variant we want to evaluate\n",
    "                dryrun=True,  # don't store results to avoid leaking data\n",
    "            )\n",
    "            for text in val_df[\"input\"]\n",
    "        ],\n",
    "        desc=f\"Evaluating variant: {variant_name}\",\n",
    "    )\n",
    "\n",
    "    # Evaluate the performance of the variant\n",
    "    valid_output_scores = []\n",
    "    exact_match_scores = []\n",
    "    jaccard_similarity_scores = []\n",
    "\n",
    "    for response, ground_truth in zip(responses, val_df[\"output\"]):\n",
    "        valid_output, exact_match, jaccard_similarity = evaluate_response(response, ground_truth)\n",
    "        valid_output_scores.append(valid_output)\n",
    "        exact_match_scores.append(exact_match)\n",
    "        jaccard_similarity_scores.append(jaccard_similarity)\n",
    "\n",
    "    scores[variant_name] = {\n",
    "        \"valid_output\": valid_output_scores,\n",
    "        \"exact_match\": exact_match_scores,\n",
    "        \"jaccard_similarity\": jaccard_similarity_scores,\n",
    "    }\n",
    "\n",
    "    # Print the performance of the variant\n",
    "    print(f\"Valid Output: {sum(valid_output_scores) / len(valid_output_scores):.1%}\")\n",
    "    print(f\"Exact Match: {sum(exact_match_scores) / len(exact_match_scores):.1%}\")\n",
    "    print(f\"Jaccard Similarity (mean): {sum(jaccard_similarity_scores) / len(jaccard_similarity_scores):.1%}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores_df = []\n",
    "\n",
    "for variant_name, variant_scores in scores.items():\n",
    "    exact_match_score = sum(variant_scores[\"exact_match\"]) / len(variant_scores[\"exact_match\"])\n",
    "    scores_df.append(\n",
    "        {\n",
    "            \"Variant\": variant_name,\n",
    "            \"Metric\": \"exact_match\",\n",
    "            \"Score\": exact_match_score,\n",
    "        }\n",
    "    )\n",
    "\n",
    "    jaccard_similarity_score = sum(variant_scores[\"jaccard_similarity\"]) / len(variant_scores[\"jaccard_similarity\"])\n",
    "\n",
    "    scores_df.append(\n",
    "        {\n",
    "            \"Variant\": variant_name,\n",
    "            \"Metric\": \"jaccard_similarity\",\n",
    "            \"Score\": jaccard_similarity_score,\n",
    "        }\n",
    "    )\n",
    "\n",
    "scores_df = pd.DataFrame(scores_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chart = (\n",
    "    alt.Chart(scores_df)\n",
    "    .encode(\n",
    "        x=alt.X(\"Score:Q\", axis=alt.Axis(format=\"%\"), scale=alt.Scale(domain=[0, 1])),\n",
    "        y=\"Variant:N\",\n",
    "        yOffset=\"Metric:N\",\n",
    "        color=\"Metric:N\",\n",
    "        text=alt.Text(\"Score:Q\", format=\".1%\"),\n",
    "    )\n",
    "    .properties(title=\"Metrics by Variant\")\n",
    ")\n",
    "\n",
    "chart = chart.mark_bar() + chart.mark_text(align=\"left\", dx=2)\n",
    "\n",
    "chart"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
